For this analysis we collected tweets containing the set of public health keywords using the free Twitter Search API, which returns a random portion of the matching tweets from the past seven days. This notebook does not include the collection process; it covers only the analysis of the collected tweets.
# Language under analysis and per-language tuning parameters.
# 'min_coocurrence' / 'min_coocurrence_hashtags' (sic) are the minimum edge
# weights required for a co-occurrence to be drawn in the network graphs below.
language = 'en'
language_ref = {
    'en': {'name': 'English',    'min_coocurrence': 10, 'min_coocurrence_hashtags': 2},
    'de': {'name': 'German',     'min_coocurrence': 1,  'min_coocurrence_hashtags': 1},
    'es': {'name': 'Spanish',    'min_coocurrence': 1,  'min_coocurrence_hashtags': 1},
    'fr': {'name': 'French',     'min_coocurrence': 1,  'min_coocurrence_hashtags': 1},
    'pt': {'name': 'Portuguese', 'min_coocurrence': 1,  'min_coocurrence_hashtags': 1},
}
import os

# Folder holding the collected tweet CSVs for the selected language, plus the
# filename prefixes used to distinguish climate files from health files.
tweets_folder = os.path.join("..", "data", "tweets", language)
tweets_climate_filename_prefix = f"tweets_climate.{language}."
tweets_health_filename_prefix = f"tweets_health.{language}."
# Processed tweets, keyed by tweet id (filled by the loading loop below).
tweets_climate = {}
tweets_health = {}
import pandas as pd
import csv
def load_tweets(filename):
    """Read a CSV file of tweets and return a dict mapping tweet id -> row dict.

    Each row must contain an 'id' column; a later row with a duplicate id
    overwrites the earlier one.
    """
    tweets = {}
    # newline='' is required by the csv module so that newlines embedded in
    # quoted fields (common in tweet text) are parsed correctly; tweets are
    # UTF-8, so don't rely on the locale's default encoding.
    with open(filename, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            tweets[row['id']] = row
    return tweets
import time
from ast import literal_eval
from textblob import TextBlob
def get_location(tweet):
    """Return the user-declared location of a tweet, or 'not_specified'.

    The 'user' field is stored in the CSV as the repr of a dict, so it is
    parsed back with ast.literal_eval — safe on untrusted tweet content,
    unlike the eval() the original code used (literal_eval was already
    imported but unused).
    """
    try:
        user = literal_eval(str(tweet['user']))
        location = user['location']
    except (KeyError, TypeError, ValueError, SyntaxError):
        # Missing 'user' field, unparsable repr, or no 'location' key.
        location = 'not_specified'
    if location == '':
        location = 'not_specified'
    return location
def get_date(tweet):
    """Return the tweet's creation month as '<month> <year>' (e.g. '1 2018'),
    or 'missing' when the timestamp cannot be parsed.

    Twitter timestamps look like 'Mon Jan 01 12:00:00 +0000 2018'.
    Raises KeyError if the row has no 'created_at' field (same as original).
    """
    timestamp = tweet['created_at']
    try:
        parsed_timestamp = time.strptime(timestamp, '%a %b %d %H:%M:%S %z %Y')
        month_year = "%s %s" % (parsed_timestamp.tm_mon, parsed_timestamp.tm_year)
    except (TypeError, ValueError):
        # Non-string or malformed timestamp; bare except narrowed to the
        # exceptions strptime actually raises.
        month_year = 'missing'
    return month_year
def get_text(tweet):
    """Return the tweet body coerced to a string."""
    return str(tweet['text'])
def get_user(tweet):
    """Return the author's handle prefixed with '@' ('@not_specified' if unknown).

    The 'user' field is the repr of a dict; parse it with ast.literal_eval
    instead of eval() — safe on untrusted tweet content.
    """
    try:
        user = literal_eval(str(tweet['user']))
        username = user['screen_name']
    except (KeyError, TypeError, ValueError, SyntaxError):
        username = 'not_specified'
    return "@" + username
def get_hashtags(tweet):
    """Return the tweet's hashtags as a list of '#tag' strings (possibly empty).

    The 'entities' field is the repr of a dict; parse it with ast.literal_eval
    instead of eval() — safe on untrusted tweet content. Any parse failure or
    missing key yields an empty list, matching the original best-effort intent.
    """
    hashtags = []
    try:
        hashtags_entities = literal_eval(str(tweet['entities']))['hashtags']
        for hashtag_entity in hashtags_entities:
            hashtags.append("#" + hashtag_entity['text'])
    except (KeyError, TypeError, ValueError, SyntaxError):
        pass
    return hashtags
def get_user_mentions(tweet):
    """Return the users mentioned in the tweet as a list of '@handle' strings.

    The 'entities' field is the repr of a dict; parse it with ast.literal_eval
    instead of eval() — safe on untrusted tweet content. Any parse failure or
    missing key yields an empty list, matching the original best-effort intent.
    """
    user_mentions = []
    try:
        user_mentions_entities = literal_eval(str(tweet['entities']))['user_mentions']
        for user_mention_entity in user_mentions_entities:
            user_mentions.append("@" + user_mention_entity['screen_name'])
    except (KeyError, TypeError, ValueError, SyntaxError):
        pass
    return user_mentions
def get_sentiment(tweet_text):
    """Classify tweet text into coarse sentiment labels.

    Returns a (polarity, subjectivity) pair of strings. TextBlob sentiment is
    only used for English; every other language gets the placeholder labels
    ('no_polarity', 'no_subjectivity').

    Thresholds: polarity >= 0.1 is 'positive', polarity <= -0.1 is 'negative',
    anything strictly in between is 'neutral'; subjectivity >= 0.5 is
    'subjective', otherwise 'objective'.
    """
    if language != 'en':
        return ('no_polarity', 'no_subjectivity')
    sentiment = TextBlob(tweet_text).sentiment
    if sentiment.polarity >= 0.1:
        polarity = 'positive'
    elif sentiment.polarity > -0.1:
        polarity = 'neutral'
    else:
        polarity = 'negative'
    subjectivity = 'subjective' if sentiment.subjectivity >= 0.5 else 'objective'
    return (polarity, subjectivity)
def process_tweet(tweet):
    """Reduce a raw tweet row to the record used throughout the analysis.

    Extracts date, location, author, text, sentiment, hashtags, and user
    mentions via the helper functions above.
    """
    text = get_text(tweet)
    return {
        'date': get_date(tweet),
        'location': get_location(tweet),
        'text': text,
        'hashtags': get_hashtags(tweet),
        'user': get_user(tweet),
        'sentiment': get_sentiment(text),
        'user_mentions': get_user_mentions(tweet),
    }
from os import listdir

# Load every health-tweet CSV for the selected language and convert each raw
# row into the reduced record produced by process_tweet().
files = listdir(tweets_folder)
for f in files:
    if tweets_health_filename_prefix in f:
        print("Loading %s" % f)
        for (tweet_id, tweet) in load_tweets(os.path.join(tweets_folder, f)).items():
            tweets_health[tweet_id] = process_tweet(tweet)
    # Climate tweets are not loaded in this run; the analysis below works on
    # the health collection only, so tweets_climate stays empty.
    # if tweets_climate_filename_prefix in f:
    #     print("Loading %s" % f)
    #     for (tweet_id, tweet) in load_tweets(os.path.join(tweets_folder, f)).items():
    #         tweets_climate[tweet_id] = process_tweet(tweet)
print("Climate tweets: %d - Health tweets: %d" % (len(tweets_climate), len(tweets_health)))
# Preview the loaded collections as dataframes (notebook cell outputs).
df_tweets_climate = pd.DataFrame.from_dict(tweets_climate, orient='index')
df_tweets_climate.head(10)
df_tweets_health = pd.DataFrame.from_dict(tweets_health, orient='index')
df_tweets_health.head(10)
import json
import os
import re

# Keyword dictionaries for the selected language, loaded from ../keywords/<lang>.json.
keywords_file = os.path.join("..", "keywords", language + ".json")
climate_dict = []
health_dict = []
# Word tuples of multi-word keywords, filled by normalise_keywords(); used
# later to configure the MWETokenizer so compounds are re-joined in tweets.
compound_terms = []
def normalise_keywords(dictionary):
    """Lower-case every keyword in *dictionary* (mutated in place) and return it.

    Multi-word keywords are rewritten with '_' joining their words (e.g.
    'Air Pollution' -> 'air_pollution'), and the tuple of their words is
    appended to the module-level compound_terms list so the MWETokenizer can
    re-join them when tokenizing tweet text.
    """
    # enumerate() replaces the original index-based range(len(...)) loop.
    for i, raw_keyword in enumerate(dictionary):
        keyword = raw_keyword.lower()
        compound = keyword.replace(' ', '_')
        if compound != keyword:
            # Multi-word keyword: keep the underscore form and remember its parts.
            keyword = compound
            compound_terms.append(tuple(compound.split('_')))
        dictionary[i] = keyword
    return dictionary
def generate_hashtags(dictionary):
    """Return the hashtag form of each normalised keyword.

    Underscores are removed and a '#' is prefixed:
    'air_pollution' -> '#airpollution'.
    """
    # Comprehension replaces the original append loop.
    return ["#" + keyword.replace('_', '') for keyword in dictionary]
# Load the climate and health keyword lists and derive their hashtag forms.
with open(keywords_file) as f:
    data = json.load(f)
    climate_dict = normalise_keywords(data['climate'])
    health_dict = normalise_keywords(data['health'])
climate_hashtag_dict = generate_hashtags(climate_dict)
health_hashtag_dict = generate_hashtags(health_dict)
# Display the loaded dictionaries (notebook cell outputs).
health_dict
health_hashtag_dict
climate_dict
climate_hashtag_dict
# Only locations with at least this many tweets are kept in per-location stats.
location_threshold = 20
## Attempt to normalise location names by inferring the country when it's the
## last part of a location name ("place, country").
## This is not used in the remainder of the notebook.
import re
# Matches "<place>, <country>" with optional spaces and trailing dots.
place_country_regex = re.compile(r"^(?P<place>[^,]+?)[ ]*,[ ]*(?P<country>[^,$]+)[ \.]*$")
inferred_countries = {}  # place name (and full location string) -> country
inferred_locations = {}  # inferred location -> tweet count
def infer_country(location):
    """Record the country part of a 'place, country' location string.

    When the regex matches, the country is stored in inferred_countries under
    both the bare place name and the full location string; otherwise nothing
    is recorded.
    """
    match = place_country_regex.match(location)
    if not match:
        return
    country = match.group('country')
    inferred_countries[match.group('place')] = country
    inferred_countries[location] = country
def get_inferred_location(location):
    """Return the country inferred for *location*, or the location unchanged.

    Uses dict.get to avoid the original's `in d.keys()` membership test
    followed by a second lookup.
    """
    return inferred_countries.get(location, location)
# First pass: learn place -> country mappings from every tweet location.
for tweet in tweets_health.values():
    infer_country(tweet['location'])
# Second pass: count tweets per inferred (country-normalised) location.
for tweet in tweets_health.values():
    inferred_location = get_inferred_location(tweet['location'])
    inferred_locations[inferred_location] = inferred_locations.get(inferred_location, 0) + 1
# Tweet counts per raw location, per month, and per sentiment label.
locations = {}
dates = {}
sentiments = {}
for tweet in tweets_health.values():
    locations[tweet['location']] = locations.get(tweet['location'], 0) + 1
    dates[tweet['date']] = dates.get(tweet['date'], 0) + 1
    (polarity, subjectivity) = tweet['sentiment']
    sentiments[polarity] = sentiments.get(polarity, 0) + 1
    sentiments[subjectivity] = sentiments.get(subjectivity, 0) + 1
# Compare raw vs. country-normalised location counts.
print("%d %d" % (len(locations.keys()), len(inferred_locations.keys())))
df_locations = pd.DataFrame(sorted(locations.items(), key=lambda k: k[1], reverse=True), columns=["Location", "Number of tweets"])
df_locations.loc[(df_locations["Number of tweets"] >= 100)]
df_dates = pd.DataFrame(sorted(dates.items(), key=lambda k: k[1], reverse=True), columns=["Date", "Number of tweets"])
df_dates
df_sentiments = pd.DataFrame(sorted(sentiments.items(), key=lambda k: k[1], reverse=True), columns=["Sentiment", "Number of tweets"])
df_sentiments
from nltk.tokenize import MWETokenizer, TweetTokenizer
import re

# TweetTokenizer splits tweet text; MWETokenizer then re-joins the multi-word
# keywords registered in compound_terms (e.g. 'air', 'pollution' -> 'air_pollution').
tweet_tokenizer = TweetTokenizer()
tokenizer = MWETokenizer(compound_terms)
# Accumulators filled by the main loop below. Each is keyed first by termset
# ("health" or "intersection"), then by location/date/sentiment/user.
per_location = {}
average_per_location = {}
proportion_per_location = {}
per_date = {}
average_per_date = {}
proportion_per_date = {}
per_sentiment = {}
average_per_sentiment = {}
proportion_per_sentiment = {}
per_user = {}
proportion_per_user = {}
# termset -> {number of keyword mentions in a tweet -> number of such tweets}
histogram_number_of_mentions = {}
global_count_health_keywords = {}
global_count_climate_keywords = {}
global_count_health = 0
global_count_intersection = 0
# Flat token lists (contexts) used later to build the word clouds.
global_health_contexts = []
global_health_hashtag_contexts = []
global_health_user_mentions_contexts = []
global_intersection_contexts = []
global_intersection_hashtag_contexts = []
global_intersection_user_mentions_contexts = []
# health keyword -> {climate keyword -> count}, keyword -> {hashtag -> count},
# hashtag -> {hashtag -> count} respectively.
cooccurrence_matrix = {}
hashtags_cooccurrence_matrix = {}
mixed_cooccurrence_matrix = {}
for termset in ["health", "intersection"]:
    per_location[termset] = {}
    average_per_location[termset] = {}
    proportion_per_location[termset] = {}
    per_date[termset] = {}
    average_per_date[termset] = {}
    proportion_per_date[termset] = {}
    per_sentiment[termset] = {}
    average_per_sentiment[termset] = {}
    proportion_per_sentiment[termset] = {}
    per_user[termset] = {}
    proportion_per_user[termset] = {}
    histogram_number_of_mentions[termset] = {}
# Single pass over all health tweets, building:
#  - keyword mention counts (global, and per location/date/sentiment/user)
#  - co-occurrence matrices (keyword-keyword, keyword-hashtag, hashtag-hashtag)
#  - the context token lists used later for the word clouds.
for tweet in tweets_health.values():
    text = tweet['text']
    hashtags = tweet['hashtags']
    location = tweet['location']
    date = tweet['date']
    (polarity, subjectivity) = tweet['sentiment']
    user = tweet['user']
    user_mentions = tweet['user_mentions']
    # Tokenize, re-join multi-word keywords ('air pollution' -> 'air_pollution'),
    # and drop short tokens (3 characters or fewer).
    wordlist = tweet_tokenizer.tokenize(text.lower())
    compounds_wordlist = tokenizer.tokenize(wordlist)
    filtered_compounds_wordlist = [w for w in compounds_wordlist if (len(w) > 3)]
    health_contexts = []
    total_intersection_mentions = 0
    total_health_mentions = 0
    health_words = []   # health keywords found in this tweet
    climate_words = []  # climate keywords found in this tweet
    for word in filtered_compounds_wordlist:
        if word in health_dict:
            total_health_mentions += 1
            # The "context" of a mention is the whole filtered tweet word list.
            context = filtered_compounds_wordlist
            health_contexts.append(context)
            global_health_contexts.extend(context)
            global_health_hashtag_contexts.extend(hashtags)
            global_health_user_mentions_contexts.extend(user_mentions)
            global_count_health_keywords[word] = global_count_health_keywords.get(word, 0) + 1
            health_words.append(word)
        if word in climate_dict: # means intersection, since we are processing health tweets here
            total_intersection_mentions += 1
            context = filtered_compounds_wordlist
            global_intersection_contexts.extend(context)
            global_intersection_hashtag_contexts.extend(hashtags)
            global_intersection_user_mentions_contexts.extend(user_mentions)
            global_count_climate_keywords[word] = global_count_climate_keywords.get(word, 0) + 1
            climate_words.append(word)
    # Co-occurrence matrices are only updated for "intersection" tweets, i.e.
    # tweets containing at least one health AND one climate keyword.
    if (len(health_words) > 0) and (len(climate_words) > 0): #means intersection
        for hword in health_words:
            if hword not in cooccurrence_matrix.keys():
                cooccurrence_matrix[hword] = {}
            if hword not in mixed_cooccurrence_matrix.keys():
                mixed_cooccurrence_matrix[hword] = {}
            for hashtag in hashtags:
                hashtag = hashtag.lower()
                mixed_cooccurrence_matrix[hword][hashtag] = mixed_cooccurrence_matrix[hword].get(hashtag, 0) + 1
            for cword in climate_words:
                cooccurrence_matrix[hword][cword] = cooccurrence_matrix[hword].get(cword, 0) + 1
                if cword not in mixed_cooccurrence_matrix.keys():
                    mixed_cooccurrence_matrix[cword] = {}
                # NOTE(review): this inner loop runs once per hword, so each
                # climate-keyword/hashtag pair is counted len(health_words)
                # times per tweet — confirm that weighting is intended.
                for hashtag in hashtags:
                    hashtag = hashtag.lower()
                    mixed_cooccurrence_matrix[cword][hashtag] = mixed_cooccurrence_matrix[cword].get(hashtag, 0) + 1
    if total_intersection_mentions > 0:
        # Hashtag-hashtag co-occurrence, intersection tweets only.
        for htag1 in hashtags:
            htag1 = htag1.lower()
            if htag1 not in hashtags_cooccurrence_matrix:
                hashtags_cooccurrence_matrix[htag1] = {}
            for htag2 in hashtags:
                htag2 = htag2.lower()
                if htag1 != htag2:
                    hashtags_cooccurrence_matrix[htag1][htag2] = hashtags_cooccurrence_matrix[htag1].get(htag2, 0) + 1
    if total_health_mentions == 0:
        # assuming all tweets collected using health keywords contain at least one health term
        total_health_mentions = 1
        context = filtered_compounds_wordlist
        health_contexts.append(context)
        global_health_contexts.extend(context)
        global_health_hashtag_contexts.extend(hashtags)
        global_health_user_mentions_contexts.extend(user_mentions)
    global_count_health += total_health_mentions
    global_count_intersection += total_intersection_mentions
    histogram_number_of_mentions["health"][total_health_mentions] = histogram_number_of_mentions["health"].get(total_health_mentions, 0) + 1
    histogram_number_of_mentions["intersection"][total_intersection_mentions] = histogram_number_of_mentions["intersection"].get(total_intersection_mentions, 0) + 1
    # Per-location totals only for locations above the popularity threshold.
    if locations[location] >= location_threshold:
        per_location["health"][location] = per_location["health"].get(location,0) + total_health_mentions
        per_location["intersection"][location] = per_location["intersection"].get(location,0) + total_intersection_mentions
    per_date["health"][date] = per_date["health"].get(date,0) + total_health_mentions
    per_date["intersection"][date] = per_date["intersection"].get(date,0) + total_intersection_mentions
    per_user["health"][user] = per_user["health"].get(user,0) + total_health_mentions
    per_user["intersection"][user] = per_user["intersection"].get(user,0) + total_intersection_mentions
    per_sentiment["health"][polarity] = per_sentiment["health"].get(polarity,0) + total_health_mentions
    per_sentiment["intersection"][polarity] = per_sentiment["intersection"].get(polarity,0) + total_intersection_mentions
    per_sentiment["health"][subjectivity] = per_sentiment["health"].get(subjectivity,0) + total_health_mentions
    per_sentiment["intersection"][subjectivity] = per_sentiment["intersection"].get(subjectivity,0) + total_intersection_mentions
    # Counts of tweets (not mentions) containing at least one keyword of the set.
    if total_health_mentions > 0:
        if locations[location] >= location_threshold:
            proportion_per_location["health"][location] = proportion_per_location["health"].get(location,0) + 1
        proportion_per_date["health"][date] = proportion_per_date["health"].get(date,0) + 1
        proportion_per_sentiment["health"][polarity] = proportion_per_sentiment["health"].get(polarity,0) + 1
        proportion_per_sentiment["health"][subjectivity] = proportion_per_sentiment["health"].get(subjectivity,0) + 1
        proportion_per_user["health"][user] = proportion_per_user["health"].get(user,0) + 1
    if total_intersection_mentions > 0:
        if locations[location] >= location_threshold:
            proportion_per_location["intersection"][location] = proportion_per_location["intersection"].get(location,0) + 1
        proportion_per_date["intersection"][date] = proportion_per_date["intersection"].get(date,0) + 1
        proportion_per_sentiment["intersection"][polarity] = proportion_per_sentiment["intersection"].get(polarity,0) + 1
        proportion_per_sentiment["intersection"][subjectivity] = proportion_per_sentiment["intersection"].get(subjectivity,0) + 1
        proportion_per_user["intersection"][user] = proportion_per_user["intersection"].get(user,0) + 1
# Convert the raw totals into per-tweet averages and percentage proportions.
for location in locations.keys():
    if locations[location] >= location_threshold:
        average_per_location["health"][location] = per_location["health"][location]/locations[location]
        average_per_location["intersection"][location] = per_location["intersection"][location]/locations[location]
        proportion_per_location["health"][location] = proportion_per_location["health"].get(location,0)/locations[location] * 100
        proportion_per_location["intersection"][location] = proportion_per_location["intersection"].get(location,0)/locations[location] * 100
for date in dates.keys():
    average_per_date["health"][date] = per_date["health"][date]/dates[date]
    average_per_date["intersection"][date] = per_date["intersection"][date]/dates[date]
    proportion_per_date["health"][date] = proportion_per_date["health"].get(date,0)/dates[date] * 100
    proportion_per_date["intersection"][date] = proportion_per_date["intersection"].get(date,0)/dates[date] * 100
for sentiment in sentiments.keys():
    average_per_sentiment["health"][sentiment] = per_sentiment["health"][sentiment]/sentiments[sentiment]
    average_per_sentiment["intersection"][sentiment] = per_sentiment["intersection"][sentiment]/sentiments[sentiment]
    proportion_per_sentiment["health"][sentiment] = proportion_per_sentiment["health"].get(sentiment,0)/sentiments[sentiment] * 100
    proportion_per_sentiment["intersection"][sentiment] = proportion_per_sentiment["intersection"].get(sentiment,0)/sentiments[sentiment] * 100
# User proportions are relative to the global mention totals.
# NOTE(review): global_count_intersection can be 0 if no tweet contains a
# climate keyword, which would make these divisions fail — confirm upstream.
for user in proportion_per_user["health"].keys():
    proportion_per_user["health"][user] = proportion_per_user["health"].get(user,0)/global_count_health * 100
for user in proportion_per_user["intersection"].keys():
    proportion_per_user["intersection"][user] = proportion_per_user["intersection"].get(user,0)/global_count_intersection * 100
Each bar corresponds to the number of tweets with x keyword mentions. There are no tweets with 0 (zero) health mentions, since the tweets were collected using the health keywords, so each tweet contains at least one of them.
# Histogram of tweets per mention count (log scale on the y axis).
df_histogram_number_of_mentions = pd.DataFrame(data=histogram_number_of_mentions)
df_histogram_number_of_mentions.plot.bar(logy=True, figsize=(20,5))
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
# Keyword-keyword co-occurrence network: health keywords (blue) linked to the
# climate keywords (green) they co-occur with, keeping only edges above the
# per-language min_coocurrence threshold.
G = nx.Graph()
health_nodes = []
climate_nodes = []
for word1 in cooccurrence_matrix.keys():
    for word2 in cooccurrence_matrix[word1].keys():
        if cooccurrence_matrix[word1][word2] > language_ref[language]['min_coocurrence']:
            G.add_edge(word1, word2, weight=cooccurrence_matrix[word1][word2])
            health_nodes.append(word1)
            climate_nodes.append(word2)
plt.figure(figsize=(15,15))
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, nodelist=health_nodes, node_size=1000, node_color='b')
nx.draw_networkx_nodes(G, pos, nodelist=climate_nodes, node_size=1000, node_color='g')
nx.draw_networkx_edges(G, pos, edgelist=G.edges(data=True), width=2)
nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif')
plt.axis('off')
plt.show()
Only co-occurrences observed in intersection tweets (tweets containing both health and climate keywords) are considered.
# Keyword-hashtag co-occurrence network: health keywords (blue) and climate
# keywords (green) linked to the hashtags (red) they co-occur with, above the
# per-language min_coocurrence threshold.
G = nx.Graph()
health_nodes = []
climate_nodes = []
hashtag_nodes = []
for word in mixed_cooccurrence_matrix.keys():
    for hashtag in mixed_cooccurrence_matrix[word].keys():
        if mixed_cooccurrence_matrix[word][hashtag] > language_ref[language]['min_coocurrence']:
            G.add_edge(word, hashtag, weight=mixed_cooccurrence_matrix[word][hashtag])
            hashtag_nodes.append(hashtag)
            if word in health_dict:
                health_nodes.append(word)
            if word in climate_dict:
                climate_nodes.append(word)
plt.figure(figsize=(15,15))
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, nodelist=health_nodes, node_size=1000, node_color='b')
nx.draw_networkx_nodes(G, pos, nodelist=climate_nodes, node_size=1000, node_color='g')
nx.draw_networkx_nodes(G, pos, nodelist=hashtag_nodes, node_size=1000, node_color='r')
nx.draw_networkx_edges(G, pos, edgelist=G.edges(data=True), width=2)
nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif')
#edge_labels = nx.get_edge_attributes(G,'weight')
#nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=10, font_family='sans-serif')
plt.axis('off')
plt.show()
Only co-occurrences observed in intersection tweets (tweets containing both health and climate keywords) are considered.
# Hashtag-hashtag co-occurrence network, edges above the per-language
# min_coocurrence_hashtags threshold.
G = nx.Graph()
nodes = []
for htag1 in hashtags_cooccurrence_matrix.keys():
    for htag2 in hashtags_cooccurrence_matrix[htag1].keys():
        if hashtags_cooccurrence_matrix[htag1][htag2] > language_ref[language]['min_coocurrence_hashtags']:
            G.add_edge(htag1, htag2, weight=hashtags_cooccurrence_matrix[htag1][htag2])
            nodes.append(htag1)
            nodes.append(htag2)
plt.figure(figsize=(15,15))
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, nodelist=nodes, node_size=1000)
nx.draw_networkx_edges(G, pos, edgelist=G.edges(data=True), width=2)
nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif')
#edge_labels = nx.get_edge_attributes(G,'weight')
#nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=10, font_family='sans-serif')
plt.axis('off')
plt.show()
# Keyword mention totals per sentiment label.
df_per_sentiment = pd.DataFrame(data=per_sentiment, index=["positive", "neutral", "negative", "objective", "subjective"])
df_per_sentiment
df_per_sentiment.T.filter(items=["health"],axis=0).plot.bar(figsize=(20,5))
df_per_sentiment.T.filter(items=["intersection"],axis=0).plot.bar(figsize=(20,5))
# Keyword mention totals per month.
df_per_date = pd.DataFrame(data=per_date)
df_per_date
ax = df_per_date.plot.line(figsize=(15,5))
ax.set_xlabel("Date")
ax.set_ylabel("Total number of references")
ax
# Average mentions per tweet, per month.
df_average_per_date = pd.DataFrame(data=average_per_date)
df_average_per_date
ax = df_average_per_date.plot.line(figsize=(15,5))
ax.set_xlabel("Date")
ax.set_ylabel("Average number of references")
ax
# Percentage of tweets with at least one mention, per month.
df_proportion_per_date = pd.DataFrame(data=proportion_per_date)
df_proportion_per_date
ax = df_proportion_per_date.plot.line(figsize=(15,5))
ax.set_xlabel("Date")
ax.set_ylabel("Proportion of tweets (%)")
ax.set_ylim(ymin=0)
ax
Due to the vast number of different locations declared by Twitter users, the tables show only locations with references to health keywords, while the plots show only locations with references to both health and climate keywords (the intersection).
# Keyword mention totals per location (threshold-filtered locations only).
df_per_location = pd.DataFrame(data=per_location)
df_per_location.sort_values(by=['intersection'], ascending=False).head(100)
ax = df_per_location.sort_values(by=['intersection'], ascending=False).head(50).plot.bar(stacked=True,figsize=(15,5), logy=True)
ax.set_xlabel("Location")
ax.set_ylabel("Total number of references")
ax
# Average mentions per tweet, per location.
df_average_per_location = pd.DataFrame(data=average_per_location)
df_average_per_location.sort_values(by=['intersection'], ascending=False).head(100)
ax = df_average_per_location.sort_values(by=['intersection'], ascending=False).head(50).plot.bar(stacked=True,figsize=(15,5))
ax.set_xlabel("Location")
ax.set_ylabel("Average number of references")
ax
# Percentage of tweets with at least one mention, per location.
df_proportion_per_location = pd.DataFrame(data=proportion_per_location)
df_proportion_per_location.sort_values(by=['intersection'], ascending=False).head(100)
ax = df_proportion_per_location.sort_values(by=['intersection'], ascending=False).head(50).plot.bar(figsize=(15,5))
ax.set_xlabel("Location")
ax.set_ylabel("Proportion of tweets (%)")
ax
# Mention totals and proportions per user.
df_per_user = pd.DataFrame(data=per_user)
df_per_user.sort_values(by=['intersection'], ascending=False).head(50)
df_proportion_per_user = pd.DataFrame(data=proportion_per_user)
df_proportion_per_user.sort_values(by=['intersection'], ascending=False).head(50)
ax = df_proportion_per_user.sort_values(by=['intersection'], ascending=False).head(50).plot.bar(figsize=(15,5))
ax.set_xlabel("User")
ax.set_ylabel("Proportion of tweets (%)")
ax
# Per-keyword mention counts.
df_health_keywords = pd.DataFrame(sorted(global_count_health_keywords.items(), key=lambda k: k[1], reverse=True), columns=["Keyword", "Number of mentions"])
df_health_keywords
df_climate_keywords = pd.DataFrame(sorted(global_count_climate_keywords.items(), key=lambda k: k[1], reverse=True), columns=["Keyword", "Number of mentions"])
df_climate_keywords
Below are word clouds of up to 200 of the most frequent words appearing in the context of our health keywords, our climate keywords, or both.
import matplotlib.pyplot as plt
import collections
from wordcloud import WordCloud, STOPWORDS
from stop_words import get_stop_words

# Keep at most this many top-frequency words in each word cloud.
threshold = 200
language_specific_stopwords = get_stop_words(language)
def create_wordcloud(contexts, stopwords=None):
    """Plot a word cloud of the most frequent tokens in *contexts*.

    Parameters
    ----------
    contexts : list of str
        Flat list of tokens (words, hashtags, or @mentions); repetitions
        determine frequency.
    stopwords : list of str, optional
        Extra tokens to exclude, on top of wordcloud's STOPWORDS and the
        language-specific stop words. The caller's list is not modified.
    """
    # BUG FIX: the original used a mutable default (stopwords=[]) and extended
    # it in place, so stop words accumulated across calls and callers' lists
    # were mutated. Build a fresh local set instead (also O(1) membership).
    excluded = set(stopwords) if stopwords is not None else set()
    excluded.update(STOPWORDS)
    excluded.update(language_specific_stopwords)
    most_frequent_words = {}
    context_unigrams = collections.Counter(contexts)
    # Counter.most_common gives the same top-N-by-frequency as the original
    # sorted(...)[0:threshold].
    for word, freq in context_unigrams.most_common(threshold):
        if word not in excluded:
            most_frequent_words[word] = freq
    wordcloud = WordCloud(background_color="white", scale=10).generate_from_frequencies(most_frequent_words)
    fig = plt.figure(1, figsize=(20, 12))
    plt.axis('off')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.show()
# Word clouds of context words, excluding the keywords themselves and 'https'
# (link fragments left by the tokenizer).
create_wordcloud(global_health_contexts, health_dict + ['https'])
create_wordcloud(global_intersection_contexts, climate_dict + health_dict + ["https"])
create_wordcloud(global_intersection_contexts, ["https"])
# Hashtag clouds (lower-cased so case variants merge).
lowercase_hashtags = []
for hashtag in global_health_hashtag_contexts:
    lowercase_hashtags.append(hashtag.lower())
create_wordcloud(lowercase_hashtags, health_hashtag_dict + ["https"])
lowercase_hashtags = []
for hashtag in global_intersection_hashtag_contexts:
    lowercase_hashtags.append(hashtag.lower())
create_wordcloud(lowercase_hashtags, ["https"])
# User-mention clouds.
# NOTE(review): as defined at L470-472, create_wordcloud's default stopwords
# list is mutable and extended in place, so these default-argument calls
# accumulate stop words across invocations — confirm/fix in the function.
create_wordcloud(global_health_user_mentions_contexts)
create_wordcloud(global_intersection_user_mentions_contexts)